source("arepa.R")
years <- 1990:2001 # possible range 1990-2013
within_km <- 9.656# 6 miles #####   4.828 #3 miles #####
observation_percent <- 67 # as per EPA suggestion
parameter_code <- 81102 # for PM 10
list_PM <- lapply(years, load_annual_average)
list_PM_10 <- lapply(list_PM,
source("arepa.R")
)
source("arepa.R")
install.packages("fields")
source("arepa.R")
install.packages("zoo")
source("arepa.R")
install.packages("zipcode")
source("arepa.R")
years <- 1990:2001 # possible range 1990-2013
within_km <- 9.656# 6 miles #####   4.828 #3 miles #####
observation_percent <- 67 # as per EPA suggestion
parameter_code <- 81102 # for PM 10
list_PM <- lapply(years, load_annual_average)
list_PM_10 <- lapply(list_PM,
function(dataset) subset(dataset, Parameter.Code == parameter_code))
PM <- rbindlist(list_PM_10)
# write.csv(PM, "PM.csv")
# Create monitor ID
PM[, Monitor := paste(sprintf("%02d", as.numeric(State.Code)),
sprintf("%03d", as.numeric(County.Code)),
"-",
sprintf("%04d", as.numeric(Site.Num)), sep = "")]
##----- Unique monitors: 3047
# unique(PM, by = "Monitor")
##----- Remove MICROSCALE monitors and valid days >= Valid.Day.Count
PM <- subset_monitors(MONITORS = PM,
parameter_code = parameter_code,
observation_percent = observation_percent)
list_PM <- lapply(years, load_annual_average)
list_PM_10 <- lapply(list_PM,
function(dataset) subset(dataset, Parameter.Code == parameter_code))
PM <- rbindlist(list_PM_10)
# write.csv(PM, "PM.csv")
# Create monitor ID
PM[, Monitor := paste(sprintf("%02d", as.numeric(State.Code)),
sprintf("%03d", as.numeric(County.Code)),
"-",
sprintf("%04d", as.numeric(Site.Num)), sep = "")]
##----- Unique monitors: 3047
# unique(PM, by = "Monitor")
##----- Remove MICROSCALE monitors and valid days >= Valid.Day.Count
PM <- subset_monitors(MONITORS = PM,
parameter_code = parameter_code,
observation_percent = observation_percent)
##----- Zip codes
ZIP <- get_zip_codes()
Link_PM_Zip_Index <- spatial_link_index(PM, "Latitude", "Longitude", "Monitor",
ZIP, "Latitude.zip", "Longitude.zip", "zip",
within = within_km)
Link_PM_Zip_Index_Unique <- copy(Link_PM_Zip_Index)
Link_PM_Zip_Index_Unique[, Include.in.Average := 0]
setkeyv(Link_PM_Zip_Index_Unique, c("Distance"))
setkeyv(Link_PM_Zip_Index_Unique, c("zip"))
Link_PM_Zip_Index_Unique[Link_PM_Zip_Index_Unique[, .I[1], by = key(Link_PM_Zip_Index_Unique)]$V1,
Include.in.Average := 1]
Link_PM_Zip_Index_Unique <- subset(Link_PM_Zip_Index_Unique, Include.in.Average == 1)
Link_PM_Zip_Index_Unique[, Include.in.Average := NULL]
MEDICARE <- fread("Medicare_annual_PM10_zipcode_simulated_6mile.csv")[, V1 := NULL]
MEDICARE$Year = MEDICARE$year
MEDICARE <- MEDICARE[!is.na(Year)]
MEDICARE <- MEDICARE[!is.na("Monitor")]
MEDICARE <- MEDICARE[, list(mean_age = mean(mean_age, na.rm = TRUE),
Female_rate = mean(Female_rate, na.rm = TRUE),
White_rate = mean(White_rate, na.rm = TRUE),
Black_rate = mean(Black_rate, na.rm = TRUE),
Total_death = sum(Total_death, na.rm = TRUE),
Tot_Beneficiary = sum(Tot_Beneficiary, na.rm = TRUE),
pyear = sum(pyear, na.rm = TRUE),
ALL_CVD = sum(ALL_CVD, na.rm = TRUE),
AMI = sum(AMI, na.rm = TRUE),
COPD = sum(COPD, na.rm = TRUE),
CV_stroke = sum(CV_stroke, na.rm = TRUE),
HF = sum(HF, na.rm = TRUE),
HRD = sum(HRD, na.rm = TRUE),
IHD = sum(IHD, na.rm = TRUE),
PVD = sum(PVD, na.rm = TRUE),
RTI = sum(RTI, na.rm = TRUE),
HRP = sum(HRP, na.rm = TRUE),
Respiratory = sum(Respiratory, na.rm = TRUE),
Nb_zip = length(Zipcode_listed_R)),
by = c("Year", "Monitor")]
##----- Merge back
PM_MEDICARE_LINKED <- merge(PM, MEDICARE, by = c("Monitor", "Year"), all = TRUE, allow.cartesian = TRUE)
names(PM_MEDICARE_LINKED)
write.csv(PM_MEDICARE_LINKED, "Merge_6mile.csv")
library(data.table)
library(maps)
dat = as.data.frame(fread(paste(dir, 'Merge_6mile.csv', sep="")))
dat = as.data.frame(fread('Merge_6mile.csv', sep=""))
dat = as.data.frame(fread('Merge_6mile.csv'))
dat$FIPS = substr(dat$Monitor, 1, 5)
attaindat=read.csv('EPA Nonattainment Table.csv')
attaindat$FIPS=paste(formatC(attaindat$fips_state, width=2, flag="0"), formatC(attaindat$fips_cnty, width=3, flag="0"), sep="")
attaindat=subset(attaindat, pollutant=='PM-10')
whichyrs = 1991:1995
attainyrs = as.vector(attaindat[, paste("pw_", whichyrs, sep="")])
attaindat$a = ifelse(apply( attainyrs =="P" | attainyrs =="W", 1, sum)>0, 1, 0) #1=attainment in 1990-1995, 0=else
whichyrs = 1991:1995
attainyrs = as.vector(attaindat[, paste("pw_", whichyrs, sep="")])
attaindat$a = ifelse(apply( attainyrs =="P" | attainyrs =="W", 1, sum)>0, 1, 0) #1=attainment in 1990-1995, 0=else
attaindat=attaindat[, (names(attaindat) %in% c("FIPS", "a"))]
censdat=read.table(paste(dir,'Cory-Census-2000-04-11-2011', sep=""), sep=",", header=TRUE)
censdat=read.table('Cory-Census-2000-04-11-2011', sep=",", header=TRUE)
names(censdat)[names(censdat)=="Master_county"]="FIPS"
names(censdat)[names(censdat)=="white_rate"]="White_cens"
names(censdat)[names(censdat)=="black_rate"]="Black_cens"
names(censdat)[names(censdat)=="Other_rate"]="Other_cens"
names(censdat)[names(censdat)=="Hispanic_rate"]="Hispanic_cens"
names(censdat)[names(censdat)=="Female_rate"]="Female_cens"
censdat$FIPS = as.character(formatC(censdat$FIPS, format="d", width=5, flag="0"))
weather=read.csv('Annual-weather-11-05-2012.csv', sep=",", header=TRUE)
weather=subset(weather, year==1990)
weather$FIPS = as.character(formatC(weather$FIPS, format="d", width=5, flag="0"))
d1 = merge(dat, attaindat, by="FIPS", all.x=TRUE)
d1$a[is.na(d1$a)] = 0
d2 = merge(d1, censdat, by="FIPS", all.x=TRUE)
d3 = merge(d2, weather, by = "FIPS", all.x=TRUE)
alldat = d3
save(alldat, file = 'alldata.RData')
library(maps)
load('alldata.RData')
alldat$Arithmetic.Mean = as.numeric(alldat$Arithmetic.Mean)
dat = alldat
Years = 1990:2010
dat = subset(dat, Year %in% Years)
## Select only the variables that will be used
dat = dat[, names(dat) %in% c("FIPS", "Year", "Monitor", "Latitude", "Longitude", "Parameter.Name", "Sample.Duration",
"Metric.Used", "State.Name", "County.Name",
"Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",
"mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",
"AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory", "a",
"Median_income", "HS_rate", "Urban_rate", "Migration_5_year_rate", "Hispanic_cens", "White_cens", "Black_cens",
"Other_cens", "Current_Smoking_rate_weighted", "HOUSEUNIT", "POPULATION", "Female_cens", "county_ID",
"Dew_point_annual_mean", "Dew_point_annual_STD", "temperature_annual_mean", "temperature_annual_STD",
"temperature_annual_max_mean", "temperature_annual_min_mean", "Weather_total_site")]
## Reshape
datwide = reshape(dat, direction = "wide",
v.names = c(  "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",
"AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory",
"Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",
"Latitude", "Longitude", "Sample.Duration", "Metric.Used"),
timevar = "Year", idvar = "Monitor")
getlast = function(x){
return(x[max(which(!is.na(x)))])
}
datwide$Longitude = apply(datwide[, paste("Longitude.", Years, sep="")], 1, getlast)
datwide$Latitude = apply(datwide[, paste("Latitude.", Years, sep="")], 1, getlast)
datwide = datwide[, !(names(datwide) %in% paste("Longitude.", Years, sep=""))]
datwide = datwide[, !(names(datwide) %in% paste("Latitude.", Years, sep=""))]
#Baseline and follow up PM
makeavgvar=function(d,varprefix, years, whichdata){
tempmat=d[, names(datwide) %in% paste(varprefix,".", years,whichdata, sep="") ]
return(rowMeans(tempmat, na.rm=TRUE))
}
## Baseline PM10 - use 1990
datwide$pm10base1990 = datwide$Arithmetic.Mean.1990
## Baseline PM10 - use 1990 - 1994
datwide$pm10base1990_1994 = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1990", "1991", "1992", "1993", "1994"), "")
datwide$pm10fu = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1999", "2000", "2001"), "")
## Follow up PM10 - use average 95th %tile from 1999-2001
datwide$pm10fu95 = makeavgvar(d=datwide, varprefix="X95th.Percentile", years=c("1999", "2000", "2001"), "")
datwide$housedens=datwide$HOUSEUNIT/datwide$POPULATION
datwide$loginc=log(datwide$Median_income)
datwide$logpop=log(datwide$POPULATION)
datwide$lhousedens = log(datwide$housedens)
west = subset(datwide, Latitude > -130 & Longitude < -100 & Longitude > -130)
west_restrict = subset(west, (!is.na(pm10base1990_1994) | !is.na(pm10fu)) & ( !is.na(Median_income) & !is.na(Current_Smoking_rate_weighted)))
datforimpute = west_restrict
save(datforimpute, file=paste(dir, 'datforimpute.RData', sep=""))
save(datforimpute, file = 'datforimpute.RData')
library(maps)
load('alldata.RData')
alldat$Arithmetic.Mean = as.numeric(alldat$Arithmetic.Mean)
dat = alldat
Years = 1990:2010
dat = subset(dat, Year %in% Years)
## Select only the variables that will be used
dat = dat[, names(dat) %in% c("FIPS", "Year", "Monitor", "Latitude", "Longitude", "Parameter.Name", "Sample.Duration",
"Metric.Used", "State.Name", "County.Name",
"Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",
"mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",
"AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory", "a",
"Median_income", "HS_rate", "Urban_rate", "Migration_5_year_rate", "Hispanic_cens", "White_cens", "Black_cens",
"Other_cens", "Current_Smoking_rate_weighted", "HOUSEUNIT", "POPULATION", "Female_cens", "county_ID",
"Dew_point_annual_mean", "Dew_point_annual_STD", "temperature_annual_mean", "temperature_annual_STD",
"temperature_annual_max_mean", "temperature_annual_min_mean", "Weather_total_site")]
## Reshape
datwide = reshape(dat, direction = "wide",
v.names = c(  "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",
"AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory",
"Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",
"Latitude", "Longitude", "Sample.Duration", "Metric.Used"),
timevar = "Year", idvar = "Monitor")
getlast = function(x){
return(x[max(which(!is.na(x)))])
}
datwide$Longitude = apply(datwide[, paste("Longitude.", Years, sep="")], 1, getlast)
datwide$Latitude = apply(datwide[, paste("Latitude.", Years, sep="")], 1, getlast)
datwide = datwide[, !(names(datwide) %in% paste("Longitude.", Years, sep=""))]
datwide = datwide[, !(names(datwide) %in% paste("Latitude.", Years, sep=""))]
#Baseline and follow up PM
makeavgvar=function(d,varprefix, years, whichdata){
tempmat=d[, names(datwide) %in% paste(varprefix,".", years,whichdata, sep="") ]
return(rowMeans(tempmat, na.rm=TRUE))
}
## Baseline PM10 - use 1990
datwide$pm10base1990 = datwide$Arithmetic.Mean.1990
## Baseline PM10 - use 1990 - 1994
datwide$pm10base1990_1994 = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1990", "1991", "1992", "1993", "1994"), "")
## Follow up PM10 - use average of 1999-2001
datwide$pm10fu = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1999", "2000", "2001"), "")
## Follow up PM10 - use average 95th %tile from 1999-2001
datwide$pm10fu95 = makeavgvar(d=datwide, varprefix="X95th.Percentile", years=c("1999", "2000", "2001"), "")
datwide$housedens=datwide$HOUSEUNIT/datwide$POPULATION
datwide$loginc=log(datwide$Median_income)
datwide$logpop=log(datwide$POPULATION)
datwide$lhousedens = log(datwide$housedens)
west = subset(datwide, Latitude > -130 & Longitude < -100 & Longitude > -130)
west_restrict = subset(west, (!is.na(pm10base1990_1994) | !is.na(pm10fu)) & ( !is.na(Median_income) & !is.na(Current_Smoking_rate_weighted)))
datforimpute = west_restrict
save(datforimpute, file = 'datforimpute.RData')
library(HEIfunctions)
library(spBayes)
library(splines)
library(xtable)
library(pscl)
library(DMwR)
set.seed(51)
install.packages("pscl")
load('datforimpute.RData')
dat = datforimpute
install.packages("DMwr")
y
install.packages("DMwR")
library(pscl)
library(DMwR)
indices <- which(is.na(dat$temperature_annual_max_mean) == T)
imputetemp <- knnImputation(cbind(dat$temperature_annual_max_mean, dat$Longitude, dat$Latitude), 3)
dat$temperature <- dat$temperature_annual_max_mean
dat$temperature[indices] <- imputetemp[indices,1]
dat$temporig = dat$temperature_annual_max_mean  ## this line added on 9/4/2015 when recreating to prevent a later (insignificant) error
dat$pm10base = dat$pm10base1990
obsdat 	<- dat[which(is.na(dat$pm10base)==F),]
misdat 	<- dat[which(is.na(dat$pm10base)==T),]
varprior 	<- c(.15,1)
samp <- sqrt(rigamma(10000,varprior[1], varprior[2]))
obscoords 	<- cbind(obsdat$Longitude, obsdat$Latitude)
miscoords 	<- cbind(misdat$Longitude, misdat$Latitude)
phiest 	<- makephi(obscoords, scale=10) #1.12531
phiprior 	<- c(makephi(obscoords, scale=3), makephi(obscoords, scale=50)) #(.3376, 5.6265)
load('datforimpute_withtemp.Rda')
dat$baseorig1990_1994 = dat$pm10base1990_1994
dat$baseorig1990 = dat$pm10base1990
load(file = "20150427pm1990preds.Rda")
nsamp    <- 50000
nburn		<- 10000
obsdat = subset(dat, !is.na(pm10base1990))
misdat = subset(dat, is.na(pm10base1990))
whichpostpreds = sample(nburn:nsamp, size=5)
imputations = cbind(outmodel$pred[, -1], outmodel$predmodel$p.y.predictive.samples[, whichpostpreds])
impdat = as.data.frame(cbind(as.character(outmodel$pred$Monitor), imputations))
names(impdat) = c("Monitor", "predmean1990", "predsd1990", paste("ppbase1990", 1:length(whichpostpreds), sep=""))
datmerged = merge(dat, impdat, by="Monitor", all.x=TRUE)
with(datmerged, table(is.na(baseorig1990), is.na(predmean1990)))
datmerged$pm10base1990[is.na(datmerged$pm10base1990)] = datmerged$predmean1990[is.na(datmerged$pm10base1990)]
dat = datmerged
save(dat, file = 'pm10dat_withbaselineimp.RData')
library(maps)
load('pm10dat_withbaselineimp.RData')
datorig = dat
with(datorig, table(is.na(Tot_Beneficiary.2001)))
with(datorig, table(Tot_Beneficiary.2001 >= 20))
dat = subset(datorig, Tot_Beneficiary.2001>=20) ## 547 locations
dim(dat)
map("state", xlim=c(-125,-100), ylim=c(25,50))
with(dat, points(Longitude, Latitude , col=a+1, pch=16))
dim(dat)
with(dat, table(a))
medyears = 1999:2010
aqsyears = 1990:2014
dat = dat[, names(dat) %in% c("Monitor", "FIPS", "State.Name", "County.Name", "a", "Median_income", "HS_rate", "Urban_rate", "Migration_5_year_rate",
"Hispanic_cens", "White_cens", "Black_cens", "Other_cens", "Current_Smoking_rate_weighted", "HOUSEUNIT", "POPULATION",
"Female_cens", paste("mean_age", medyears, sep="."), paste("Female_rate", medyears, sep="."),
paste("White_rate", medyears, sep="."), paste("Black_rate", medyears, sep="."),
paste("Total_death", medyears, sep="."), paste("Tot_Beneficiary", medyears, sep="."), paste("pyear", medyears, sep="."),
paste("ALL_CVD", medyears, sep="."), paste("AMI", medyears, sep="."), paste("COPD", medyears, sep="."),
paste("CV_stroke", medyears, sep="."), paste("HF", medyears, sep="."), paste("HRD", medyears, sep="."),
paste("IHD", medyears, sep="."), paste("PVD", medyears, sep="."), paste("RTI", medyears, sep="."),
paste("HRP", medyears, sep="."), paste("Respiratory", medyears, sep="."),
paste("Arithmetic.Mean", aqsyears, sep="."), paste("X98th.Percentile", aqsyears, sep="."),
paste("X95th.Percentile", aqsyears, sep="."), paste("X90th.Percentile", aqsyears, sep="."),
"Longitude", "Latitude", "pm10base1990", "baseorig1990", "pm10base1990_1994", "baseorig1990_1994", "pm10fu", "pm10fu95",
"temperature", "temporig", "pm10baseorig", "housedens", "loginc", "logpop", "lhousedens")]
save(dat, file=paste(dir, 'pm10analysis.RData', sep=""))
save(dat, file = 'pm10analysis.RData')
